Dataset: https://raw.githubusercontent.com/shrikant-temburwar/Wine-Quality-Dataset/master/winequality-red.csv
# Show "image.png" as this cell's rich output.
# NOTE(review): presumably a file stored next to the notebook; confirm it exists.
from IPython import display
display.Image("image.png")
# Pandas and Numpy
import pandas as pd
import numpy as np
# Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# For Q-Q Plot
import scipy.stats as stats
# To ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Machine Learning libraries
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
### To be able to see maximum columns on screen
pd.set_option('display.max_columns', 500)
# Source of the red-wine quality table; the file is semicolon-delimited.
DATA_URL = 'https://raw.githubusercontent.com/shrikant-temburwar/Wine-Quality-Dataset/master/winequality-red.csv'
dataset = pd.read_csv(DATA_URL, sep=';')
# Preview the first five rows.
dataset.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
### getting unique values for quality feature
# List the distinct quality scores present, in order of first appearance.
dataset.quality.unique()
array([5, 6, 7, 4, 8, 3], dtype=int64)
### getting count of record for each unique value in quality
# Number of rows per quality score, most frequent first, to gauge class balance.
dataset.quality.value_counts()
5 681 6 638 7 199 4 53 8 18 3 10 Name: quality, dtype: int64
# getting null values and datatypes of all features
# info() prints the non-null count and dtype of every column plus memory usage.
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
### checking duplicate values
# Count rows that exactly repeat an earlier row (all columns equal).
dataset.duplicated().sum()
240
### dropping duplicate values
# Keep the first occurrence of each repeated row and modify `dataset` in place.
# The index is not reset, so surviving rows keep their original row labels.
dataset.drop_duplicates(inplace=True)
### checking duplicate values
# Re-run the duplicate count after the drop; it should now be zero.
dataset.duplicated().sum()
0
### checking different value counts
# Class distribution of `quality` after de-duplication.
dataset['quality'].value_counts()
5 577 6 535 7 167 4 53 8 17 3 10 Name: quality, dtype: int64
### getting info about numerical features
# Summary statistics per column, transposed so that each feature is one row.
dataset.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1359.0 | 8.310596 | 1.736990 | 4.60000 | 7.1000 | 7.9000 | 9.20000 | 15.90000 |
| volatile acidity | 1359.0 | 0.529478 | 0.183031 | 0.12000 | 0.3900 | 0.5200 | 0.64000 | 1.58000 |
| citric acid | 1359.0 | 0.272333 | 0.195537 | 0.00000 | 0.0900 | 0.2600 | 0.43000 | 1.00000 |
| residual sugar | 1359.0 | 2.523400 | 1.352314 | 0.90000 | 1.9000 | 2.2000 | 2.60000 | 15.50000 |
| chlorides | 1359.0 | 0.088124 | 0.049377 | 0.01200 | 0.0700 | 0.0790 | 0.09100 | 0.61100 |
| free sulfur dioxide | 1359.0 | 15.893304 | 10.447270 | 1.00000 | 7.0000 | 14.0000 | 21.00000 | 72.00000 |
| total sulfur dioxide | 1359.0 | 46.825975 | 33.408946 | 6.00000 | 22.0000 | 38.0000 | 63.00000 | 289.00000 |
| density | 1359.0 | 0.996709 | 0.001869 | 0.99007 | 0.9956 | 0.9967 | 0.99782 | 1.00369 |
| pH | 1359.0 | 3.309787 | 0.155036 | 2.74000 | 3.2100 | 3.3100 | 3.40000 | 4.01000 |
| sulphates | 1359.0 | 0.658705 | 0.170667 | 0.33000 | 0.5500 | 0.6200 | 0.73000 | 2.00000 |
| alcohol | 1359.0 | 10.432315 | 1.082065 | 8.40000 | 9.5000 | 10.2000 | 11.10000 | 14.90000 |
| quality | 1359.0 | 5.623252 | 0.823578 | 3.00000 | 5.0000 | 6.0000 | 6.00000 | 8.00000 |
### getting null values in each feature
# Number of missing entries in each column.
dataset.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
### getting correlation between independent and dependent features
# Pairwise correlation of every column (target included), rounded to 2 decimals.
corr = dataset.corr().round(2)
corr
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1.00 | -0.26 | 0.67 | 0.11 | 0.09 | -0.14 | -0.10 | 0.67 | -0.69 | 0.19 | -0.06 | 0.12 |
| volatile acidity | -0.26 | 1.00 | -0.55 | -0.00 | 0.06 | -0.02 | 0.07 | 0.02 | 0.25 | -0.26 | -0.20 | -0.40 |
| citric acid | 0.67 | -0.55 | 1.00 | 0.14 | 0.21 | -0.05 | 0.05 | 0.36 | -0.55 | 0.33 | 0.11 | 0.23 |
| residual sugar | 0.11 | -0.00 | 0.14 | 1.00 | 0.03 | 0.16 | 0.20 | 0.32 | -0.08 | -0.01 | 0.06 | 0.01 |
| chlorides | 0.09 | 0.06 | 0.21 | 0.03 | 1.00 | 0.00 | 0.05 | 0.19 | -0.27 | 0.39 | -0.22 | -0.13 |
| free sulfur dioxide | -0.14 | -0.02 | -0.05 | 0.16 | 0.00 | 1.00 | 0.67 | -0.02 | 0.06 | 0.05 | -0.08 | -0.05 |
| total sulfur dioxide | -0.10 | 0.07 | 0.05 | 0.20 | 0.05 | 0.67 | 1.00 | 0.08 | -0.08 | 0.04 | -0.22 | -0.18 |
| density | 0.67 | 0.02 | 0.36 | 0.32 | 0.19 | -0.02 | 0.08 | 1.00 | -0.36 | 0.15 | -0.50 | -0.18 |
| pH | -0.69 | 0.25 | -0.55 | -0.08 | -0.27 | 0.06 | -0.08 | -0.36 | 1.00 | -0.21 | 0.21 | -0.06 |
| sulphates | 0.19 | -0.26 | 0.33 | -0.01 | 0.39 | 0.05 | 0.04 | 0.15 | -0.21 | 1.00 | 0.09 | 0.25 |
| alcohol | -0.06 | -0.20 | 0.11 | 0.06 | -0.22 | -0.08 | -0.22 | -0.50 | 0.21 | 0.09 | 1.00 | 0.48 |
| quality | 0.12 | -0.40 | 0.23 | 0.01 | -0.13 | -0.05 | -0.18 | -0.18 | -0.06 | 0.25 | 0.48 | 1.00 |
### getting list of numerical features
# Uses the full column Index of `dataset`, so the list also contains the
# `quality` column alongside the measurement columns.
numerical_features=dataset.columns
print(numerical_features)
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
### getting count of unique value in each feature
# Compute the distinct-value count of every column in one call, then report
# them in column order.
for column_name, distinct_count in dataset[numerical_features].nunique().items():
    print("'{}' has '{}' No. of unique values".format(column_name, distinct_count))
'fixed acidity' has '96' No. of unique values 'volatile acidity' has '143' No. of unique values 'citric acid' has '80' No. of unique values 'residual sugar' has '91' No. of unique values 'chlorides' has '153' No. of unique values 'free sulfur dioxide' has '60' No. of unique values 'total sulfur dioxide' has '144' No. of unique values 'density' has '436' No. of unique values 'pH' has '89' No. of unique values 'sulphates' has '96' No. of unique values 'alcohol' has '65' No. of unique values 'quality' has '6' No. of unique values
### visualising count of quality which is discrete feature
# Bar chart of the number of rows at each quality score.
sns.countplot(data=dataset, x='quality')
<AxesSubplot:xlabel='quality', ylabel='count'>
### Getting list of continuous features as only discrete feature is quality
# A column counts as continuous when it has more than 6 distinct values.
distinct_counts = dataset[numerical_features].nunique()
continuous_features = distinct_counts[distinct_counts > 6].index.tolist()
print(continuous_features)
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
### Checking distribution of Continuous numerical features
# One figure per feature: histogram with a KDE overlay on the left and a
# normal Q-Q plot on the right.
for column_name in continuous_features:
    _, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(15, 6))

    sns.histplot(data=dataset, x=column_name, kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title("{}'s distribution".format(column_name), fontweight="bold", fontsize=15)

    # probplot draws onto the given axes; its default title is replaced below.
    stats.probplot(dataset[column_name], dist='norm', plot=qq_ax)
    qq_ax.set_title("{}'s Q-Q Plot".format(column_name), fontweight="bold", fontsize=15)

    plt.show()
### Comparing Continuous numerical features with quality feature
# Apply the seaborn theme once, before any axes exist. Axes take their style at
# creation time, so calling sns.set() inside the loop (after the first subplot
# had already been created) could leave the first panel styled differently from
# the rest. The 'figure.figsize' entry only affects figures created later
# without an explicit size; it cannot resize the figure made below.
sns.set(rc={'figure.figsize': (7, 7)})
# One colour per quality level present in the data, not a hard-coded 6.
palette1 = sns.color_palette("tab10", dataset['quality'].nunique())
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(15, 45))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    sns.kdeplot(data=dataset, x=column_name, hue='quality', palette=palette1, fill=True, ax=ax)
    ax.set_title("{} Vs quality".format(column_name), fontweight="bold", fontsize=15)
### Checking outliers in numerical features
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (10, 6)})
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(20, 40))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    # Horizontal box plot of one feature over all rows.
    sns.boxplot(data=dataset, x=column_name, color='y', ax=ax)
    ax.set_title(column_name, fontweight="bold", fontsize=15)
### getting outliers in features for each unique value in quality feature
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (10, 6)})
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(20, 40))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    # One box per quality level, so outliers can be compared across levels.
    sns.boxplot(data=dataset, y=column_name, x='quality', ax=ax)
    ax.set_title("{} vs quality".format(column_name), fontsize=15, fontweight="bold")
### visualising data scatter in each continuous feature with respect to quality
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (7, 8)})
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(20, 50))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    # Every row drawn as a point, grouped by quality level.
    sns.stripplot(data=dataset, y=column_name, x='quality', ax=ax)
    ax.set_title("{} Vs quality".format(column_name), fontsize=15, fontweight="bold")
### plotting regplot for features vs quality
# `quality` still holds the original scores at this point; the rare-class
# relabelling only happens in a later cell.
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (8, 9)})
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(20, 55))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    # Scatter of feature vs quality with a fitted linear regression line.
    sns.regplot(data=dataset, x=column_name, y='quality', ax=ax)
    ax.set_xlabel(column_name)
    ax.set_ylabel("quality")
    ax.set_title("{} Vs quality".format(column_name), fontweight='bold', fontsize=15)
### masking rare categories having values less than 20 percent as a new category 9
# Share of rows per quality score, then that share looked up for every row.
frequencies = dataset['quality'].value_counts(normalize=True)
mapping = dataset['quality'].map(frequencies)
# Keep scores whose share is at least 20%; every other score is replaced by
# the single label 9, whatever its original value was.
is_common = mapping >= 0.2
dataset['quality'] = dataset['quality'].where(is_common, 9)
dataset['quality'].value_counts()
5 577 6 535 9 247 Name: quality, dtype: int64
### visualising count of modified quality which is discrete feature
# Set the default figure size first so the countplot below is created at 7x5.
sns.set(rc={'figure.figsize':(7,5)})
sns.countplot(data=dataset, x='quality')
<AxesSubplot:xlabel='quality', ylabel='count'>
### Comparing Continuous numerical features with modified quality feature
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (7, 7)})
# One colour per quality label present in the data, not a hard-coded 3.
palette1 = sns.color_palette("tab10", dataset['quality'].nunique())
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(15, 45))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    sns.kdeplot(data=dataset, x=column_name, hue='quality', palette=palette1, fill=True, ax=ax)
    ax.set_title("{} Vs quality".format(column_name), fontweight="bold", fontsize=15)
### getting outliers in features for each unique value in modified quality feature
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (10, 6)})
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(20, 40))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    # One box per quality label, so outliers can be compared across labels.
    sns.boxplot(data=dataset, y=column_name, x='quality', ax=ax)
    ax.set_title("{} vs quality".format(column_name), fontsize=15, fontweight="bold")
### getting correlation between independent and modified dependent features
# Recompute the rounded correlation matrix now that `quality` has changed.
# NOTE(review): `quality` is correlated as a plain number here; if some of
# its values are category codes rather than scores, read its row with care.
corr = dataset.corr().round(2)
corr
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1.00 | -0.26 | 0.67 | 0.11 | 0.09 | -0.14 | -0.10 | 0.67 | -0.69 | 0.19 | -0.06 | 0.08 |
| volatile acidity | -0.26 | 1.00 | -0.55 | -0.00 | 0.06 | -0.02 | 0.07 | 0.02 | 0.25 | -0.26 | -0.20 | -0.17 |
| citric acid | 0.67 | -0.55 | 1.00 | 0.14 | 0.21 | -0.05 | 0.05 | 0.36 | -0.55 | 0.33 | 0.11 | 0.14 |
| residual sugar | 0.11 | -0.00 | 0.14 | 1.00 | 0.03 | 0.16 | 0.20 | 0.32 | -0.08 | -0.01 | 0.06 | 0.05 |
| chlorides | 0.09 | 0.06 | 0.21 | 0.03 | 1.00 | 0.00 | 0.05 | 0.19 | -0.27 | 0.39 | -0.22 | -0.09 |
| free sulfur dioxide | -0.14 | -0.02 | -0.05 | 0.16 | 0.00 | 1.00 | 0.67 | -0.02 | 0.06 | 0.05 | -0.08 | -0.12 |
| total sulfur dioxide | -0.10 | 0.07 | 0.05 | 0.20 | 0.05 | 0.67 | 1.00 | 0.08 | -0.08 | 0.04 | -0.22 | -0.23 |
| density | 0.67 | 0.02 | 0.36 | 0.32 | 0.19 | -0.02 | 0.08 | 1.00 | -0.36 | 0.15 | -0.50 | -0.17 |
| pH | -0.69 | 0.25 | -0.55 | -0.08 | -0.27 | 0.06 | -0.08 | -0.36 | 1.00 | -0.21 | 0.21 | 0.02 |
| sulphates | 0.19 | -0.26 | 0.33 | -0.01 | 0.39 | 0.05 | 0.04 | 0.15 | -0.21 | 1.00 | 0.09 | 0.17 |
| alcohol | -0.06 | -0.20 | 0.11 | 0.06 | -0.22 | -0.08 | -0.22 | -0.50 | 0.21 | 0.09 | 1.00 | 0.42 |
| quality | 0.08 | -0.17 | 0.14 | 0.05 | -0.09 | -0.12 | -0.23 | -0.17 | 0.02 | 0.17 | 0.42 | 1.00 |
### Plotting heatmap for visualising the correlation between features
# Set the default figure size first so the heatmap is created at 15x10.
# vmin/vmax pin the colour scale to the full correlation range [-1, 1].
sns.set(rc={'figure.figsize':(15,10)})
sns.heatmap(data=corr, annot=True, vmin=-1, vmax=1, cmap="YlGnBu")
<AxesSubplot:>
### plotting regplot for features vs modified quality
# NOTE(review): the fitted line treats `quality` as a number. If the column
# now contains a catch-all category code, that code is a label rather than a
# score, so the slope is not an ordinal trend.
# The theme call does not depend on the loop variable, so it runs once up
# front instead of once per panel. Its 'figure.figsize' entry only affects
# figures created later without an explicit size.
sns.set(rc={'figure.figsize': (8, 7)})
# Two panels per row; derive the row count from the number of features.
n_rows = (len(continuous_features) + 1) // 2
plt.figure(figsize=(20, 40))
for position, column_name in enumerate(continuous_features, start=1):
    ax = plt.subplot(n_rows, 2, position)
    sns.regplot(data=dataset, x=column_name, y='quality', ax=ax)
    ax.set_xlabel(column_name)
    ax.set_ylabel("quality")
    ax.set_title("{} Vs quality".format(column_name), fontweight='bold', fontsize=15)
### exporting dataset to csv
# Persist the cleaned table to the working directory.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not need the original row labels.
output_path = 'winedataset.csv'
dataset.to_csv(output_path)
# Class distribution of the exported target column.
dataset['quality'].value_counts()
5 577 6 535 9 247 Name: quality, dtype: int64
# Separate predictors from the target: the last column is the target and
# everything before it is a feature.
target_column = dataset.columns[-1]
X = dataset.drop(columns=target_column)
y = dataset[target_column]
X.head(3)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 |
# Preview the first three target labels.
y.head(3)
0 5 1 5 2 5 Name: quality, dtype: int64
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# proportions should match between the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)
X_train.head(3)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 390 | 5.6 | 0.85 | 0.05 | 1.4 | 0.045 | 12.0 | 88.0 | 0.9924 | 3.56 | 0.82 | 12.9 |
| 654 | 8.6 | 0.47 | 0.47 | 2.4 | 0.074 | 7.0 | 29.0 | 0.9979 | 3.08 | 0.46 | 9.5 |
| 895 | 7.1 | 0.59 | 0.01 | 2.3 | 0.080 | 27.0 | 43.0 | 0.9955 | 3.42 | 0.58 | 10.7 |
# Preview the first three training labels.
y_train.head(3)
390 9 654 5 895 6 Name: quality, dtype: int64
# Preview the first three held-out feature rows.
X_test.head(3)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1008 | 8.9 | 0.350 | 0.40 | 3.6 | 0.110 | 12.0 | 24.0 | 0.99549 | 3.23 | 0.70 | 12.0 |
| 1163 | 9.0 | 0.785 | 0.24 | 1.7 | 0.078 | 10.0 | 21.0 | 0.99692 | 3.29 | 0.67 | 10.0 |
| 689 | 8.1 | 0.380 | 0.48 | 1.8 | 0.157 | 5.0 | 17.0 | 0.99760 | 3.30 | 1.05 | 9.4 |
# Preview the first three held-out labels.
y_test.head(3)
1008 9 1163 5 689 5 Name: quality, dtype: int64
### Creating copy of test and training data for feature scaling
# Take a real copy. Plain assignment only binds a second name to the same
# DataFrame, so any in-place edit made through X_train1 would also change
# X_train.
X_train1 = X_train.copy()
X_train1.head(3)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 390 | 5.6 | 0.85 | 0.05 | 1.4 | 0.045 | 12.0 | 88.0 | 0.9924 | 3.56 | 0.82 | 12.9 |
| 654 | 8.6 | 0.47 | 0.47 | 2.4 | 0.074 | 7.0 | 29.0 | 0.9979 | 3.08 | 0.46 | 9.5 |
| 895 | 7.1 | 0.59 | 0.01 | 2.3 | 0.080 | 27.0 | 43.0 | 0.9955 | 3.42 | 0.58 | 10.7 |
# Independent copy of the held-out features; plain assignment would only
# alias X_test rather than copy it.
X_test1 = X_test.copy()
X_test1.head(3)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1008 | 8.9 | 0.350 | 0.40 | 3.6 | 0.110 | 12.0 | 24.0 | 0.99549 | 3.23 | 0.70 | 12.0 |
| 1163 | 9.0 | 0.785 | 0.24 | 1.7 | 0.078 | 10.0 | 21.0 | 0.99692 | 3.29 | 0.67 | 10.0 |
| 689 | 8.1 | 0.380 | 0.48 | 1.8 | 0.157 | 5.0 | 17.0 | 0.99760 | 3.30 | 1.05 | 9.4 |
# Independent copy of the training labels; plain assignment would only alias
# y_train rather than copy it.
y_train1 = y_train.copy()
y_train1.head(3)
390 9 654 5 895 6 Name: quality, dtype: int64
# Independent copy of the held-out labels; plain assignment would only alias
# y_test rather than copy it.
y_test1 = y_test.copy()
y_test1.head(3)
1008 9 1163 5 689 5 Name: quality, dtype: int64
# Create the (still unfitted) standardiser; the bare name displays its repr.
scalar=StandardScaler()
scalar
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Fit the scaler on the training features only, then standardise them with
# the fitted scaler. X_train1 is rebound to the transformed result.
scalar.fit(X_train1)
X_train1 = scalar.transform(X_train1)
X_train1
array([[-1.55901763e+00, 1.76931216e+00, -1.15352302e+00, ...,
1.58656678e+00, 1.04003958e+00, 2.22677443e+00],
[ 1.33664887e-01, -3.07322897e-01, 9.81976515e-01, ...,
-1.46311310e+00, -1.23393896e+00, -8.64908567e-01],
[-7.12676372e-01, 3.48456596e-01, -1.35690393e+00, ...,
6.97076813e-01, -4.75946114e-01, 2.26273669e-01],
...,
[ 2.46510388e-01, -2.52674606e-01, 6.76905153e-01, ...,
-1.80815989e-03, -7.91776467e-01, 4.44099629e-02],
[-1.27690388e+00, -1.01775068e+00, -1.36618481e-01, ...,
3.15866828e-01, -1.36027110e+00, -3.19317449e-01],
[-9.94790125e-01, 9.49587798e-01, -1.00098734e+00, ...,
8.24146808e-01, -4.75946114e-01, -2.28385596e-01]])
# Transform the held-out features with the already-fitted scaler; only
# transform() is called here, so the scaler is not re-fitted on test data.
X_test1=scalar.transform(X_test)
X_test1
array([[ 0.30293314, -0.96310239, 0.62605993, ..., -0.51008814,
0.28204673, 1.40838776],
[ 0.35935589, 1.41409827, -0.18746371, ..., -0.12887815,
0.09254852, -0.4102493 ],
[-0.14844887, -0.79915752, 1.03282174, ..., -0.06534316,
2.4928592 , -0.95584042],
...,
[-0.09202612, -0.08872973, -0.18746371, ..., 0.18879683,
-0.09694969, -0.86490857],
[-0.48698537, -0.19802631, -0.44168984, ..., 0.50647182,
-0.7286104 , -0.68304486],
[ 1.0364289 , -0.58056435, 1.49042879, ..., -1.71725309,
-0.16011576, -0.77397671]])
# Fix the seed so the fitted tree, and the metrics reported for it, come out
# the same on every run. An unseeded tree can break ties between equally good
# splits differently from run to run.
dtc = DecisionTreeClassifier(random_state=10)
dtc
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Train the decision tree on X_train / y_train.
dtc.fit(X_train, y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Predict a quality label for every held-out row.
dtc_prediction=dtc.predict(X_test)
dtc_prediction
array([9, 5, 6, 5, 6, 5, 6, 6, 6, 9, 5, 5, 6, 5, 5, 5, 6, 9, 9, 6, 5, 6,
9, 5, 5, 5, 5, 6, 5, 5, 9, 6, 6, 6, 5, 9, 6, 5, 9, 5, 5, 6, 6, 6,
5, 9, 9, 6, 5, 6, 6, 5, 6, 6, 9, 6, 6, 9, 6, 9, 6, 6, 9, 5, 6, 5,
5, 6, 5, 5, 5, 6, 6, 6, 5, 9, 9, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 9,
5, 5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 6, 6, 5, 9, 6, 5, 6, 9, 6, 6, 5,
5, 5, 6, 5, 5, 9, 5, 9, 5, 6, 9, 9, 5, 6, 6, 5, 5, 5, 5, 5, 6, 6,
5, 5, 9, 9, 5, 5, 6, 6, 6, 5, 9, 6, 5, 5, 5, 6, 9, 6, 5, 9, 6, 6,
9, 6, 6, 5, 6, 6, 5, 6, 5, 5, 6, 9, 5, 5, 5, 5, 6, 5, 5, 6, 6, 9,
6, 6, 9, 5, 5, 6, 6, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6, 6, 9, 5, 6, 5,
9, 9, 6, 5, 5, 6, 9, 6, 6, 6, 6, 9, 6, 6, 9, 9, 5, 5, 5, 5, 9, 6,
6, 6, 5, 5, 5, 6, 6, 6, 5, 6, 5, 5, 6, 9, 6, 6, 5, 6, 6, 9, 5, 5,
9, 6, 9, 6, 5, 6, 5, 9, 6, 5, 6, 6, 5, 6, 6, 9, 6, 9, 5, 9, 6, 5,
6, 6, 6, 9, 5, 5, 6, 5, 9, 5, 6, 6, 9, 9, 6, 5, 5, 6, 6, 6, 5, 6,
5, 6, 5, 9, 5, 5, 5, 9, 6, 6, 5, 6, 9, 5, 6, 6, 6, 9, 6, 6, 5, 6,
9, 6, 5, 6, 6, 5, 9, 6, 5, 5, 6, 6, 5, 9, 6, 6, 5, 9, 6, 5, 5, 5,
5, 6, 6, 6, 9, 5, 5, 5, 6, 5], dtype=int64)
# Per-class precision, recall and F1 plus overall accuracy on the test split.
print(classification_report(y_test,dtc_prediction ))
precision recall f1-score support
5 0.58 0.56 0.57 145
6 0.46 0.50 0.48 129
9 0.39 0.36 0.38 66
accuracy 0.50 340
macro avg 0.48 0.47 0.47 340
weighted avg 0.50 0.50 0.50 340
### Visualising the Decision Tree
from sklearn import tree

fig = plt.figure(figsize=(25, 20))
# Draw the already-fitted `dtc` instead of fitting it again. A second fit of
# an unseeded tree can yield a different tree, so the picture would not
# necessarily show the model used for earlier predictions. `clf` stays bound
# to the same estimator object, exactly as `dtc.fit(...)` returned it.
clf = dtc
tree.plot_tree(
    clf,
    filled=True,
    # Label the splits and leaves with real names instead of X[i] indices.
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in clf.classes_],
)
# Save before show(): some backends discard the figure once it has been shown.
fig.savefig("decision tree.png")
plt.show()
# Logistic regression with library-default settings.
log_reg=LogisticRegression()
log_reg
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Train on the X_train1 / y_train1 variant of the training split.
log_reg.fit(X_train1, y_train1)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Predict on X_test1, the counterpart of the features this model was fitted on.
log_reg_pred=log_reg.predict(X_test1)
log_reg_pred
array([6, 5, 6, 5, 6, 6, 5, 6, 6, 9, 5, 5, 6, 5, 5, 6, 6, 6, 6, 6, 5, 5,
6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 6, 5,
5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, 9, 5, 6, 5, 6, 9, 6, 5, 9,
5, 5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6, 5, 5, 5, 5, 6, 5, 6,
5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 6, 6, 6, 5, 5,
5, 5, 5, 5, 5, 5, 6, 6, 5, 6, 5, 6, 5, 6, 9, 6, 5, 5, 5, 5, 6, 5,
5, 5, 6, 5, 5, 5, 9, 6, 6, 5, 6, 6, 5, 5, 5, 6, 5, 6, 5, 6, 6, 6,
6, 9, 5, 9, 5, 5, 5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 6, 6, 6, 5, 5, 9,
9, 6, 6, 5, 9, 5, 9, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 5, 6, 6, 6, 5,
6, 6, 6, 9, 5, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 5, 5, 5, 5, 9, 6,
5, 6, 5, 5, 5, 6, 6, 6, 6, 9, 5, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5,
5, 6, 6, 6, 5, 9, 6, 5, 6, 5, 6, 6, 6, 6, 6, 9, 6, 9, 6, 6, 6, 6,
6, 6, 6, 9, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 5, 6,
5, 6, 5, 6, 5, 5, 9, 6, 6, 6, 5, 6, 6, 9, 6, 6, 5, 6, 5, 6, 5, 6,
6, 5, 5, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6, 5, 6, 5, 6, 5,
6, 6, 9, 6, 6, 5, 5, 5, 5, 6], dtype=int64)
# Per-class precision, recall and F1 plus overall accuracy on the test split.
print(classification_report(y_test1, log_reg_pred))
precision recall f1-score support
5 0.63 0.69 0.66 145
6 0.50 0.62 0.56 129
9 0.52 0.18 0.27 66
accuracy 0.56 340
macro avg 0.55 0.50 0.50 340
weighted avg 0.56 0.56 0.54 340
# Support vector classifier with library-default settings.
svc=SVC()
svc
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Train on the X_train1 / y_train1 variant of the training split.
svc.fit(X_train1, y_train1)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Predict on X_test1, the counterpart of the features this model was fitted on.
svc_pred=svc.predict(X_test1)
svc_pred
array([9, 5, 6, 5, 5, 5, 5, 6, 6, 9, 5, 5, 6, 5, 5, 6, 5, 6, 9, 6, 5, 6,
6, 5, 6, 5, 5, 6, 5, 6, 9, 6, 6, 5, 5, 6, 5, 5, 6, 5, 5, 6, 6, 5,
5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, 9, 5, 6, 5, 6, 9, 6, 5, 6,
5, 5, 6, 5, 5, 5, 6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 9,
5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 6, 6, 6, 5, 5,
5, 5, 5, 5, 5, 5, 6, 6, 5, 6, 5, 6, 5, 6, 9, 6, 5, 6, 5, 5, 6, 6,
5, 5, 9, 5, 5, 5, 5, 9, 6, 5, 9, 6, 5, 5, 5, 6, 5, 6, 6, 6, 6, 5,
6, 6, 5, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6,
6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 5, 6, 6, 5, 6, 5, 5, 5, 9, 6, 9, 5,
6, 6, 6, 9, 5, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 5, 5, 5, 5, 6, 6,
5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5,
5, 6, 6, 6, 5, 6, 5, 5, 6, 6, 9, 6, 6, 9, 5, 6, 6, 9, 6, 6, 5, 5,
6, 6, 6, 9, 5, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 9, 5, 6, 5, 6,
5, 6, 5, 6, 5, 5, 5, 9, 6, 6, 5, 6, 9, 5, 6, 6, 5, 6, 6, 6, 5, 6,
6, 5, 5, 9, 6, 5, 5, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6, 5, 5, 5, 6, 5,
6, 9, 6, 6, 6, 5, 5, 5, 5, 5], dtype=int64)
# Per-class precision, recall and F1 plus overall accuracy on the test split.
print(classification_report(y_test1, svc_pred))
precision recall f1-score support
5 0.64 0.75 0.69 145
6 0.54 0.60 0.57 129
9 0.62 0.23 0.33 66
accuracy 0.59 340
macro avg 0.60 0.53 0.53 340
weighted avg 0.60 0.59 0.57 340
# Fix the seed so the forest's bootstrap samples and feature draws, and
# therefore its reported metrics, are the same on every run.
rand_for = RandomForestClassifier(random_state=10)
rand_for
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
# Train the forest on X_train / y_train.
rand_for.fit(X_train, y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
# Predict a quality label for every held-out row.
rand_for_pred=rand_for.predict(X_test)
rand_for_pred
array([6, 6, 6, 5, 9, 5, 6, 6, 5, 9, 5, 5, 6, 5, 5, 6, 6, 5, 9, 6, 5, 6,
6, 5, 6, 5, 5, 6, 5, 5, 9, 6, 6, 5, 5, 6, 5, 5, 6, 5, 5, 6, 6, 5,
5, 5, 6, 6, 5, 6, 6, 5, 5, 5, 6, 6, 5, 9, 5, 6, 5, 5, 9, 5, 5, 6,
5, 5, 6, 5, 5, 5, 6, 6, 6, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 9,
5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6, 6, 6, 6, 5, 5,
5, 5, 6, 5, 5, 6, 6, 6, 5, 6, 5, 9, 5, 6, 9, 6, 5, 6, 5, 5, 6, 5,
5, 5, 9, 5, 5, 5, 5, 9, 5, 5, 9, 6, 5, 6, 5, 6, 5, 6, 5, 6, 6, 6,
5, 6, 5, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 6,
9, 6, 9, 5, 5, 6, 6, 5, 5, 5, 5, 6, 6, 5, 6, 5, 5, 5, 9, 9, 9, 5,
6, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 9, 5, 5, 5, 5, 6, 6,
5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 9, 5, 5,
5, 6, 9, 6, 5, 9, 6, 5, 6, 5, 6, 6, 9, 9, 6, 9, 6, 9, 6, 9, 5, 9,
5, 6, 6, 9, 5, 9, 6, 6, 9, 6, 6, 6, 9, 6, 6, 6, 6, 9, 6, 6, 5, 6,
5, 6, 5, 5, 5, 5, 5, 6, 9, 6, 5, 6, 9, 5, 6, 6, 5, 9, 6, 6, 5, 5,
5, 5, 5, 6, 6, 5, 5, 6, 6, 5, 6, 6, 6, 9, 9, 6, 6, 5, 5, 5, 6, 5,
6, 9, 6, 6, 6, 5, 6, 5, 5, 5], dtype=int64)
# Per-class precision, recall and F1 plus overall accuracy on the test split.
print(classification_report(y_test, rand_for_pred))
precision recall f1-score support
5 0.64 0.72 0.68 145
6 0.54 0.58 0.56 129
9 0.47 0.27 0.35 66
accuracy 0.58 340
macro avg 0.55 0.53 0.53 340
weighted avg 0.57 0.58 0.57 340